#' Country-date
#'
#' Checks whether a given string is properly formatted as a
#' country-date string.
#'
#' @param x Character vector
#'
#' @details A country-date string consists of a three-letter V-Dem
#'     country identifier, \code{country_text_id}, concatenated with a
#'     space to a year-month-day date, \code{historical_date}. The
#'     resulting string is mostly used as the rownames of matrices in
#'     order to identify each country-date observation.
#'
#'     Ideally, we would create a \code{country_date} S3 class so that
#'     we can avoid the performance cost of constantly checking
#'     whether our country-date strings are properly
#'     formatted. Unfortunately, while R allows character vectors with
#'     additional attributes to be set as the rownames of matrices,
#'     when subsetting a matrix the resulting rownames lose all of
#'     those attributes. We can partially avoid the performance
#'     problem in a less elegant way with memoisation.
#'
#' @section Warning: No attempt is currently made to ensure that the
#'     date portion of a country-date string is a valid date beyond
#'     checking the number of digits.
#'
#'     \code{is_country_date} will also always return \code{FALSE} for
#'     missing values; however, \code{NULL} will return
#'     \code{logical(0)}.
#'
#' @return Logical vector.
#'
#' @examples
#' is_country_date(c("AFG 1900-01-01", "pilot_1_v1", "BFD 2001"))
#'
#' @family country-date functions
#' @export
is_country_date <- function(x) {
    # Exactly three capital letters, a single whitespace character and
    # then a YYYY-MM-DD shaped date. Only the number of digits is
    # checked, not whether the date actually exists.
    pattern <- "^[A-Z]{3}\\s\\d{4}-\\d{2}-\\d{2}$"

    # grepl() yields FALSE for NA elements and logical(0) for NULL.
    grepl(pattern, x, perl = TRUE)
}

assert_str <- function(x) {
    # Internal guard: raise an error unless every element of `x` is
    # accepted by `is_country_date`. An empty `x` passes.
    valid <- is_country_date(x)

    if (!all(valid)) {
        stop("Invalid country-date string", call. = FALSE)
    }
}


#' Vignette ratings
#'
#' Indicates whether a country-date represents a vignette.
#'
#' @param x Character vector of country-dates
#' @param na.rm Whether to set \code{NA} values to false.
#'
#' @details Vignettes are identified by unique identifiers constructed
#'     from the vignette type, threshold, and version.  Note, we merge
#'     historical vignettes into contemporary when the question text
#'     hasn't changed --- which so far includes contemporary pilot
#'     vignettes only --- meaning that the raw data contained in a
#'     vignette row in a V-Dem country-date data matrix may not match
#'     directly to the raw data in our database.
#'
#' @return Logical vector
#'
#' @examples
#' is_vignette(c("AFG 1900-01-01", "historical_1_v2", "pilot_1_v1"))
#'
#' @family country-date functions
#'
#' @export
is_vignette <- function(x, na.rm = TRUE) {
    # A vignette identifier starts with its type, followed by a single
    # digit and a single digit version, e.g. "pilot_1_v1".
    hits <- grepl("^(pilot|historical|contemporary)_\\d_v\\d", x)

    # grepl() already reports FALSE for missing values, which is what
    # we want when `na.rm` is exactly TRUE.
    if (isTRUE(na.rm))
        return(hits)

    # Otherwise propagate the missingness from `x`.
    hits[is.na(x)] <- NA
    hits
}

#' Extract date or country from country-date
#'
#' Extracts the individual components from a country-date string
#'
#' @param x Character Vector
#'
#' @details Country-date strings are a string concatenation of a three
#'     letter country ID, \code{country_text_id}, and an ISO 8601
#'     date, \code{historical_date}. \code{get_date} is a vectorised
#'     function that returns the date part, converted using
#'     \code{as.Date}.
#'
#' @return Date vector
#'
#' @examples
#' get_date("SWE 1988-04-20")
#'
#' @family country-date functions
#'
#' @export
get_date <- function(x) {
    assert_str(x)

    # Strip the leading run of one to three non-whitespace characters
    # together with the whitespace that follows it; what is left is
    # the date portion, which is handed to as.Date().
    date_part <- sub("^\\S{1,3}\\s", "", x, perl = TRUE)
    as.Date(date_part)
}

#' @details The \code{get_text_id} function is the vectorised
#'     counterpart to \code{get_date} that returns the
#'     \code{country_text_id} component.
#'
#' @examples
#' get_text_id("DEU 1951-02-23")
#'
#' @rdname get_date
#' @export
get_text_id <- function(x) {
    assert_str(x)

    # Remove everything from the whitespace preceding the four digit
    # year onwards, leaving only the country identifier.
    text_id <- sub("\\s\\d{4}.*$", "", x, perl = TRUE)
    unclass(text_id)
}

#' Sort a vector of country-dates
#'
#' Sorts a character vector of country-dates based on the
#' \code{country_text_id} prefixes.
#'
#' @param x Character Vector
#' @param decreasing Logical indicating whether to sort by decreasing
#'     order.
#' @param na.last Logical indicating whether NAs should be appended
#'     to the end or beginning of the sorted vector.
#'
#' @details Sorts first country-date strings followed by vignette
#'     identifiers. The fact that we append vignettes to the end is
#'     simply a convenience for when working with the MM code.
#'
#' @examples
#' v <- c("USA 1920-01-01", "MEX 1902-01-01", "MEX 1900-01-01",
#'        "pilot_1_v1", "pilot_2_v1")
#' sort_text_id(v)
#'
#' @family country-date functions
#'
#' @export
sort_text_id <- function(x, decreasing = FALSE, na.last = TRUE) {
    # Partition `x` into three groups: missing values, candidate
    # country-dates (three non-whitespace characters followed by
    # whitespace) and everything else, which must be vignette
    # identifiers.
    is_missing <- is.na(x)
    is_cd <- grepl("^\\S{3}\\s", x)
    is_vig <- !is_cd & !is_missing

    # Assert first that we have valid strings and vignette tags. This
    # sort function is the only `country-date` specific function that
    # should be able to deal with vignette identifiers.
    #
    # Missing values are deliberately kept out of both checks.
    # Validating them as vignettes would always fail, which made it
    # impossible to ever reach the `na.last` handling below.
    assert_str(x[is_cd])
    if (!all(is_vignette(x[is_vig])))
        stop("Invalid vignette string", call. = FALSE)

    # Country-dates first, vignette identifiers second; each group is
    # sorted on its own.
    out <- c(sort(x[is_cd], decreasing = decreasing),
             sort(x[is_vig], decreasing = decreasing))

    # Missing values go last only when `na.last` is exactly TRUE,
    # otherwise first.
    nas <- x[is_missing]
    if (isTRUE(na.last))
        c(out, nas)
    else
        c(nas, out)
}

#' Date to year
#'
#' Return the four digit year from a Date object.
#'
#' @param date Date or character
#'
#' @return Integer
#'
#' @examples
#' v <- as.Date(c("1900-01-01", "1901-01-01"))
#' to_year(v)
#'
#' @export
to_year <- function(date) {
    # S3 generic: dispatch on the class of `date`.
    UseMethod("to_year")
}

#' @export
to_year.Date <- function(date) {
    # Start from an all-missing double vector so that NA dates stay NA
    # in the result.
    years <- rep(NA_real_, length(date))

    # Only the non-missing dates are formatted and converted.
    observed <- !is.na(date)
    years[observed] <- as.integer(format(date[observed], "%Y"))

    years
}

#' @export
to_year.character <- function(date) {
    # Start from an all-missing double vector so that NA input stays
    # NA in the result.
    years <- rep(NA_real_, length(date))

    observed <- !is.na(date)

    # Take the first four characters of the non-missing elements only.
    # Subsetting with `observed` on both sides keeps the replacement
    # aligned with its target; using the full vector on the right hand
    # side shifts the years and triggers a recycling warning as soon
    # as `date` contains an NA.
    years[observed] <- as.integer(substr(date[observed], 1, 4))

    years
}

#' Gap Index
#'
#' \code{create_idx} returns a grouping index as a numeric vector
#' denoting sequential dates, \emph{i.e.} groups separated by year
#' gaps. The resulting vector can be coerced to a factor for use with
#' \code{split} or \code{dplyr::group_by}.
#'
#' @param x Vector of years or Date objects.
#'
#' @details Where this is most useful is when we fill in missing
#'     observations and we want to make sure we never fill across year
#'     gaps. \code{create_idx} returns a grouping vector that can be
#'     used when splitting before filling.
#'
#'     The actual values of the returned vector are inconsequential
#'     and are only useful for identifying groups.
#'
#'     As an example, V-Dem does not code the occupation of Germany
#'     after WWII. To ensure that values prior to 1946 are not used to
#'     fill in missingness after 1949, the data should be split by
#'     gaps by creating an index column denoting the two groups.
#'
#' @examples
#' dates <- as.Date(c("1900-01-01", "1901-12-31", "1903-01-01"))
#' create_idx(dates)
#'
#' years <- c(1900, 1901, 1905, 1907)
#' create_idx(years)
#'
#' @export
create_idx <- function(x) {
    # S3 generic: dispatch on the class of `x`.
    UseMethod("create_idx")
}

#' @export
create_idx.default <- function(x) {
    if (any(is.na(x)))
        stop("Invalid input vector contained NA", call. = FALSE)

    # Years must be numeric with exactly four digits before the
    # decimal point, i.e. in [1000, 10000). The scalar `||` short
    # circuits, so non-numeric input is rejected with "Invalid years"
    # instead of failing inside the arithmetic. The explicit range
    # comparison also rejects zero and negative values cleanly.
    if (!is.numeric(x) || any(x < 1000 | x >= 10000))
        stop("Invalid years", call. = FALSE)

    # A new group starts at the first element and wherever the step
    # from the previous year is larger than one.
    cumsum(c(TRUE, diff(x) > 1))
}

#' @export
create_idx.Date <- function(x) {
    if (anyNA(x)) {
        stop("Invalid input vector contained NA", call. = FALSE)
    }

    # A step of more than 366 days between neighbouring dates opens a
    # new group; the first element always opens the first group.
    new_group <- diff(x) > 366
    cumsum(c(TRUE, new_group))
}

#' Expand based on row.names
#'
#' \code{stretch} expands a matrix row-wise by a given character
#' vector of country-dates. For example, it can be used to expand a
#' country-date reduced matrix to the full time series.
#'
#' @param x Matrix to expand.
#' @param by CharacterVector of country-dates to expand by.
#' @param gaps Logical, whether to preserve gaps when expanding.
#' @param preserve.na Logical, whether to preserve NA from the original
#'     matrix, \code{x}.
#' @param interpolate Logical, whether to first interpolate each year
#'     before filling in missing values in the expanded matrix.
#'
#' @details Given a matrix, \code{x}, with country-date rownames and a
#'     character vector of country-dates that should be added
#'     (\emph{i.e.,} expanded to), \code{stretch} will output a matrix
#'     with \code{ncol(x)} columns and \code{length(union(by,
#'     rownames(x)))} rows. The rows shared between the output matrix
#'     and \code{x} will be filled in with the values from \code{x}
#'     before calling \code{\link{locf}}.
#'
#'     Missingness will be filled in by \code{country_text_id}, sorted
#'     by \code{historical_date}. If \code{gaps} is \code{TRUE}, gaps
#'     will be preserved such that if there is more than a 366 day
#'     difference between two observations, the last observation will
#'     not be carried forward.
#'
#'     When \code{preserve.na} is \code{TRUE}, \code{NA} in the
#'     original matrix, \code{x}, will be preserved and propagated in
#'     the output matrix.
#'
#'     If \code{interpolate} is \code{TRUE}, before calling
#'     \code{\link{locf}}, values in the output matrix will be first
#'     interpolated per country-year using
#'     \code{\link{interpolate}} --- specifically, this means that if
#'     there's a single non-missing observation on "12-31", the year
#'     will first be backfilled.
#'
#' @examples
#' m <- matrix(1:2, 2, 1, dimnames = list(c("AFG 1790-12-31", "AFG 1791-12-31"),
#'                                        NULL))
#'
#' full_names <- c("AFG 1790-12-31", "AFG 1791-05-04",
#'                 "AFG 1791-12-31", "AFG 1795-12-31")
#'
#' stretch(m, full_names)
#'
#' # Expand with interpolation
#' stretch(m, full_names, interpolate = TRUE)
#'
#' # Ignore gaps
#' stretch(m, full_names, gaps = FALSE)
#'
#' @family fill functions
#' @export
stretch <- function(x, by, gaps = TRUE, preserve.na = TRUE, interpolate = FALSE) {
    # S3 generic: dispatch on the class of `x`.
    UseMethod("stretch")
}

#' @export
stretch.matrix <- function(x, by, gaps = TRUE, preserve.na = TRUE, interpolate = FALSE) {
    # Both the requested names and the existing rownames have to be
    # valid country-date strings.
    assert_str(by)
    assert_str(rownames(x))

    # Nothing to add: return `x` ordered by its rownames.
    if (setequal(rownames(x), by))
        return(x[order(rownames(x)),, drop = FALSE])

    # Allocate one row per name in the union of `by` and the rownames
    # of `x`. Sizing the matrix by `length(by)` fails with a dimnames
    # error whenever `x` has rownames that are missing from `by`, or
    # `by` contains duplicates.
    full_names <- union(by, rownames(x))
    full.ma <- matrix(NA, length(full_names), ncol(x),
                      dimnames = list(full_names, colnames(x)))

    # Copy the rows of `x` into place, then order rows by name.
    full.ma[match(rownames(x), full_names), ] <- x
    full.ma <- full.ma[sort(rownames(full.ma)),, drop = FALSE]

    # If TRUE we preserve NA from `x` and carry it forward in the
    # full.ma matrix since we assume that it represents missingness
    # that we want retained. Unfortunately, we're running up against
    # the R type system here so we have no meaningful way to
    # distinguish NAs from expanding and NAs originally in `x`. As an
    # ugly hack, replace the NAs from `x` with Inf and convert back in
    # the end.
    #
    # Note that only rows of `x` in which every column is NA are
    # marked this way.
    if (isTRUE(preserve.na)) {
        b <- rowSums(is.na(x)) == ncol(x)

        if (any(b))
            full.ma[rownames(full.ma) %in% rownames(x)[b], ] <- Inf
    }

    factors <- rownames(full.ma) %>% get_text_id
    dates <- rownames(full.ma) %>% get_date

    if (isTRUE(gaps)) {
        # Compute a gap index per country and fold it into the
        # grouping so that groups do not span a gap. From here on
        # `gaps` holds the index vector rather than the logical
        # argument.
        gaps <- split(dates, factors) %>%
            lapply(create_idx) %>%
            unlist

        if (any(is.na(gaps)))
            stop("Unable to set gaps", call. = FALSE)

        # NOTE(review): `%^%` is defined outside this section;
        # presumably it combines the two vectors element-wise into a
        # single grouping key --- confirm.
        factors <- factors %^% gaps
    }

    if (isTRUE(interpolate)) {
        years <- to_year(dates)

        # We really should think about simplifying this
        #
        # The function is looked up by name because the bare symbol
        # `interpolate` refers to the logical argument inside this
        # function.
        # NOTE(review): `by_split` is defined outside this section;
        # presumably it applies the function to each group of rows ---
        # confirm.
        full.ma <- by_split(full.ma, factors %^% years,
                           methods::getFunction("interpolate"))
        full.ma <- full.ma[order(rownames(full.ma)),, drop = FALSE]
    }

    out <- by_split(full.ma, factors, locf)

    # Undo the Inf marker from the `preserve.na` step. This turns every
    # infinite value into NA, including any that were present in `x`.
    out[is.infinite(out)] <- NA

    out[order(rownames(out)),, drop = FALSE]
}

#' Front-fill an object
#'
#' S3 generic function that front-fills by carrying the last
#' observation forward. For DataFrames and Matrices, this is done
#' column-wise.
#'
#' @param x Object to front fill.
#'
#' @examples
#' v <- c(NA, 1, NA, 2, 2)
#' locf(v)
#'
#' df <- data.frame(x = c(1, NA, NA))
#' (out <- locf(df))
#'
#' # The original object should be preserved
#' identical(df, out)
#'
#' m <- matrix(1:6, 3, 2)
#' m[1, 2] <- NA
#'
#' # This should still be identical to m
#' locf(m)
#'
#' @family fill functions
#' @export
locf <- function(x) {
    # S3 generic: dispatch on the class of `x`.
    UseMethod("locf")
}

#' Interpolation
#'
#' S3 generic function that back fills an object if the last element
#' is the only non-missing observation. Otherwise, front filling by
#' carrying the last observation forward is done, similar to
#' \code{\link{locf}}. For DataFrames and Matrices, this is all done
#' column-wise.
#'
#' @param x Object to interpolate.
#'
#' @details The differences between \code{locf} and \code{interpolate}
#'     are frustratingly trivial; however, we need the
#'     \code{interpolate} functions in situations where the only
#'     observation is at the default date ("-12-31"), which should
#'     then represent the entire year.
#'
#'     For example, when conforming multiple country-date level
#'     variables, missingness introduced by merging should be first
#'     interpolated before being expanded so as to better reflect the
#'     observation at the default date representing the entire year,
#'     rather than the previous year's default date observation being
#'     carried forward until "-12-30".
#'
#' @examples
#' df <- data.frame(x = c(1, NA, 2),
#'                  y = as.Date(c("1900-12-31", "1901-10-12", "1902-12-31")))
#'
#' transform(df, x = interpolate(x))
#'
#' # For comparison purposes
#' transform(df, x = locf(x))
#'
#' @family fill functions
#' @export
interpolate <- function(x) {
    # S3 generic: dispatch on the class of `x`.
    UseMethod("interpolate")
}
